import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split as tts

"""
Variables in order:
00: CRIM     per capita crime rate by town
01: ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
02: INDUS    proportion of non-retail business acres per town
03: CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
04: NOX      nitric oxides concentration (parts per 10 million)
05: RM       average number of rooms per dwelling
06: AGE      proportion of owner-occupied units built prior to 1940
07: DIS      weighted distances to five Boston employment centres
08: RAD      index of accessibility to radial highways
09: TAX      full-value property-tax rate per $10,000
10: PTRATIO  pupil-teacher ratio by town
11: B        1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town (this script replaces it with the recovered Bk itself where it can be determined)
12: LSTAT    % lower status of the population
13: MEDV     Median value of owner-occupied homes in $1000's (target; appended as the last column)
"""
# Download and parse the raw Boston housing file.
# Each record is spread over two consecutive physical lines, with columns
# aligned by a variable number of spaces. Splitting on any run of whitespace
# keeps neighbouring fields apart even when a single space separates them
# (a literal two-space separator merges such fields into one string and
# then requires a per-row repair).
raw_df = pd.read_csv('http://lib.stat.cmu.edu/datasets/boston', sep=r"\s+", skiprows=22, header=None)

# Even rows hold the leading features of a record; odd rows hold the last two
# features followed by the target value.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]).astype(float)
target = raw_df.values[1::2, 2].astype(float)

# Fail loudly if the remote file no longer has the expected layout instead of
# silently writing a corrupted dataset further down.
if data.shape[1] != 13 or np.isnan(data).any() or np.isnan(target).any():
    raise ValueError(
        "Unexpected layout in the downloaded Boston dataset: expected 13 "
        "fully populated feature columns and a populated target column, "
        "got a feature array of shape {}".format(data.shape)
    )

# Correcting the B variable: recover Bk from B = 1000(Bk - 0.63)^2.
# Inverting the formula gives two candidate roots per row,
# 0.63 - sqrt(B/1000) and 0.63 + sqrt(B/1000).
b_col = -2  # B is stored in the second-to-last column of `data`
root_distance = np.sqrt(data[:, b_col] / 1000)
b1 = 0.63 - root_distance
b2 = 0.63 + root_distance

# An upper root above 1 cannot be a proportion, so the lower root is the only
# valid answer for those rows; every other row is left untouched here.
upper_root_impossible = b2 > 1
data[:, b_col] = np.where(upper_root_impossible, b1, data[:, b_col])

# Root choice for individual rows, taken from the analysis of
# https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8
analyzed_choices = (
    ([102, 367, 423, 438, 445, 454, 466], b1),
    ([416, 424, 426, 429, 430, 431, 432, 433, 434, 450, 455, 456, 457], b2),
)
for analyzed_rows, chosen_root in analyzed_choices:
    data[analyzed_rows, b_col] = chosen_root[analyzed_rows]

# Dropping ambiguous values from the features and the target alike, so both
# stay aligned row by row.
ambiguous_rows = [155, 156, 410, 411, 412, 414, 415, 417, 418, 419, 425, 427, 428, 435, 436, 437]
data = np.delete(data, ambiguous_rows, axis=0)
target = np.delete(target, ambiguous_rows, axis=0)

# Append y as the last column of X, then split the rows 80/20 into train and
# test with a fixed seed so the split is reproducible.
data = np.column_stack((data, target))
train, test = tts(data, test_size=0.2, random_state=123)

# Write each subset to its own comma-separated file.
for csv_name, subset in (('Boston_housing_train.csv', train), ('Boston_housing_test.csv', test)):
    np.savetxt(csv_name, subset, delimiter=',', comments='')
